In [1]:
import nltk;
import string;
import math;
import csv;

Collocations

Benjamin Bray

(this notebook requires NLTK)

Text Preprocessing

First, we preprocess the text document by

  • converting to lowercase
  • removing punctuation
  • counting unigrams and bigrams
  • saving unigram and bigram counts to a file

In [2]:
# read text file
text_path = "data/crime-and-punishment.txt";
with open(text_path) as f:
    text_raw = f.read().lower();

# remove punctuation
translate_table = dict((ord(char), None) for char in string.punctuation);
text_raw = text_raw.translate(translate_table);

# tokenize
tokens = nltk.word_tokenize(text_raw);
bigrams = nltk.bigrams(tokens);

# unigram/bigram frequencies
unigram_counts = nltk.FreqDist(tokens);
bigram_counts = nltk.FreqDist(bigrams);

# write to file
unigram_path = text_path + ".unigrams";
bigram_path = text_path + ".bigrams";

with open(unigram_path, "w") as f:
    writer = csv.writer(f);
    filtered = [ (w,c) for w,c in unigram_counts.items() if c > 1];
    writer.writerows(filtered);
    
with open(bigram_path, "w") as f:
    writer = csv.writer(f);
    filtered = [ (b[0], b[1],c) for b,c in bigram_counts.items() if c > 3];
    writer.writerows(filtered);

Most Common Words & Phrases

Here are the top few most common words:


In [3]:
unigram_counts.most_common(20)


Out[3]:
[('the', 7807),
 ('and', 6902),
 ('to', 5266),
 ('he', 4657),
 ('a', 4568),
 ('i', 3939),
 ('you', 3807),
 ('of', 3806),
 ('in', 3188),
 ('it', 2973),
 ('that', 2913),
 ('was', 2820),
 ('his', 2115),
 ('at', 2064),
 ('her', 1823),
 ('but', 1780),
 ('not', 1778),
 ('with', 1706),
 ('for', 1648),
 ('she', 1628)]

Below are the most commmon word pairs. These aren't collocations!


In [4]:
bigram_counts.most_common(20)


Out[4]:
[(('in', 'the'), 778),
 (('of', 'the'), 598),
 (('he', 'was'), 505),
 (('he', 'had'), 498),
 (('to', 'the'), 488),
 (('on', 'the'), 479),
 (('i', 'am'), 460),
 (('at', 'the'), 459),
 (('it', 'was'), 413),
 (('that', 'he'), 335),
 (('you', 'are'), 326),
 (('to', 'be'), 308),
 (('in', 'a'), 307),
 (('do', 'you'), 292),
 (('with', 'a'), 264),
 (('did', 'not'), 256),
 (('was', 'a'), 249),
 (('for', 'the'), 246),
 (('at', 'once'), 244),
 (('and', 'he'), 241)]

Collocations

To find collocations, we sort pairs of words by their pointwise mutual information, $$ \mathrm{pmi}(x;y) = \log \frac{p(x,y)}{p(x)p(y)} $$


In [5]:
# compute pmi
pmi_bigrams = [];

for bigram,_ in bigram_counts.most_common(1000):
    w1, w2 = bigram;
    
    # compute pmi
    actual = bigram_counts[bigram];
    expected = unigram_counts[w1] * unigram_counts[w2];
    pmi = math.log( actual / expected );
    
    pmi_bigrams.append( (w1, w2, pmi) );

# sort pmi
pmi_sorted = sorted(pmi_bigrams, key=lambda x: x[2], reverse=True);

Here are the top 30 collocations according to PMI:


In [6]:
pmi_sorted[:30]


Out[6]:
[('nikodim', 'fomitch', -3.1780538303479458),
 ('andrey', 'semyonovitch', -3.1780538303479458),
 ('dmitri', 'prokofitch', -3.871201010907891),
 ('sofya', 'semyonovna', -4.330733340286331),
 ('marfa', 'petrovna', -4.37158498596076),
 ('rodion', 'romanovitch', -4.574710978503383),
 ('avdotya', 'romanovna', -4.74493212836325),
 ('pulcheria', 'alexandrovna', -4.820281565605037),
 ('great', 'deal', -5.2805000013568755),
 ('good', 'heavens', -5.509550266836412),
 ('katerina', 'ivanovnas', -5.569434422125123),
 ('ilya', 'petrovitch', -5.636573724962751),
 ('pyotr', 'petrovitch', -5.665343459987014),
 ('katerina', 'ivanovna', -5.731418449229828),
 ('amalia', 'ivanovna', -5.87493073085203),
 ('make', 'haste', -5.961996125397379),
 ('each', 'other', -6.009777970852694),
 ('head', 'clerk', -6.170751200206783),
 ('old', 'woman', -6.17264389932316),
 ('any', 'case', -6.267612970702526),
 ('sat', 'down', -6.283876793935164),
 ('long', 'ago', -6.322955453378991),
 ('sit', 'down', -6.343232735746065),
 ('porfiry', 'petrovitch', -6.519492016470095),
 ('an', 'hour', -6.645639872435834),
 ('no', 'doubt', -6.7665084847599015),
 ('young', 'man', -6.980870198698654),
 ('let', 'us', -7.152727800737759),
 ('my', 'dear', -7.185930346210463),
 ('excuse', 'me', -7.317169568047098)]

Just for fun, here are the bottom 30 collocations according to PMI. These are the word pairs that occur together less frequently than expected:


In [7]:
pmi_sorted[-30:]


Out[7]:
[('was', 'that', -12.663334382870111),
 ('you', 'that', -12.666187451852519),
 ('and', 'for', -12.691536152637951),
 ('her', 'he', -12.696268979639559),
 ('have', 'the', -12.711844297245333),
 ('had', 'he', -12.768049453586553),
 ('you', 'to', -12.770571828604202),
 ('it', 'you', -12.776187676580868),
 ('as', 'the', -12.792253213656782),
 ('you', 'and', -12.80749714102439),
 ('it', 'it', -12.903611192637879),
 ('but', 'to', -12.917900767413236),
 ('and', 'it', -12.94507302854542),
 ('that', 'in', -12.953045343511224),
 ('you', 'you', -12.962832988148833),
 ('to', 'a', -12.98852390615963),
 ('had', 'the', -13.071124656426408),
 ('that', 'a', -13.071564998120076),
 ('you', 'in', -13.092869971424403),
 ('to', 'it', -13.132365958569206),
 ('to', 'that', -13.178669267029216),
 ('it', 'a', -13.205281749465744),
 ('you', 'he', -13.224987596565148),
 ('and', 'to', -13.331055399808484),
 ('i', 'to', -13.35120094156068),
 ('that', 'and', -13.597629435749182),
 ('you', 'a', -13.62690506999492),
 ('it', 'the', -13.702006331096907),
 ('and', 'of', -13.71660472152757),
 ('you', 'the', -13.773385598017642)]

Reading from CSV

Here I'm just testing out reading from the CSV files I created:


In [8]:
unigram_path = "data/crime-and-punishment.txt.unigrams";
bigram_path = "data/crime-and-punishment.txt.bigrams";

with open(unigram_path) as f:
    reader = csv.reader(f);
    unigrams = { row[0] : int(row[1]) for row in csv.reader(f)}
    
with open(bigram_path) as f:
    reader = csv.reader(f);
    bigrams = { (row[0],row[1]) : int(row[2]) for row in csv.reader(f)}

In [9]:
bigrams


Out[9]:
{('to', 'hell'): 5,
 ('room', 'the'): 13,
 ('people', 'to'): 4,
 ('the', 'assistant'): 10,
 ('so', 'soon'): 5,
 ('get', 'out'): 5,
 ('and', 'seemed'): 8,
 ('the', 'murder'): 24,
 ('gazed', 'at'): 20,
 ('stairs', 'to'): 5,
 ('discuss', 'it'): 4,
 ('me', 'all'): 5,
 ('an', 'axe'): 12,
 ('to', 'treat'): 4,
 ('in', 'very'): 7,
 ('i', 'look'): 9,
 ('why', 'who'): 4,
 ('now', 'not'): 4,
 ('a', 'time'): 26,
 ('among', 'them'): 9,
 ('direction', 'of'): 9,
 ('and', 'never'): 4,
 ('last', 'week'): 6,
 ('the', 'palais'): 6,
 ('he', 'turned'): 47,
 ('known', 'it'): 4,
 ('on', 'her'): 46,
 ('what', 'made'): 12,
 ('and', 'out'): 4,
 ('and', 'for'): 35,
 ('to', 'zossimov'): 5,
 ('him', 'but'): 41,
 ('why', 'he'): 20,
 ('a', 'cup'): 7,
 ('wont', 'have'): 5,
 ('her', 'work'): 4,
 ('that', 'minute'): 7,
 ('go', 'on'): 32,
 ('her', 'voice'): 10,
 ('possibility', 'of'): 7,
 ('so', 'isnt'): 5,
 ('your', 'eyes'): 5,
 ('asked', 'in'): 10,
 ('why', 'is'): 14,
 ('for', 'half'): 8,
 ('knew', 'it'): 10,
 ('on', 'that'): 24,
 ('waked', 'up'): 9,
 ('three', 'roubles'): 8,
 ('what', 'right'): 5,
 ('you', 'again'): 5,
 ('everyone', 'to'): 4,
 ('or', 'i'): 7,
 ('in', 'two'): 10,
 ('end', 'to'): 8,
 ('petrovna', 'and'): 5,
 ('did', 'they'): 4,
 ('the', 'empty'): 7,
 ('were', 'looking'): 6,
 ('in', 'that'): 64,
 ('with', 'sonia'): 7,
 ('door', 'i'): 4,
 ('all', 'his'): 36,
 ('suspicion', 'and'): 4,
 ('it', 'no'): 6,
 ('and', 'where'): 15,
 ('mother', 'has'): 5,
 ('you', 'ive'): 6,
 ('here', 'he'): 22,
 ('hand', 'but'): 5,
 ('we', 'could'): 5,
 ('there', 'may'): 4,
 ('illness', 'and'): 4,
 ('of', 'money'): 11,
 ('aloud', 'and'): 4,
 ('full', 'possession'): 6,
 ('some', 'moments'): 4,
 ('be', 'over'): 4,
 ('ought', 'not'): 11,
 ('under', 'his'): 13,
 ('is', 'open'): 4,
 ('perhaps', 'have'): 4,
 ('so', 'a'): 4,
 ('nerves', 'were'): 5,
 ('asked', 'for'): 7,
 ('simply', 'from'): 9,
 ('have', 'anything'): 4,
 ('his', 'visit'): 7,
 ('and', 'turning'): 6,
 ('a', 'mother'): 5,
 ('beside', 'himself'): 4,
 ('are', 'an'): 9,
 ('its', 'only'): 10,
 ('know', 'your'): 6,
 ('at', 'their'): 10,
 ('smile', 'but'): 6,
 ('rushing', 'to'): 4,
 ('certainly', 'did'): 5,
 ('but', 'we'): 24,
 ('rodya', 'rodya'): 5,
 ('yourself', 'that'): 7,
 ('the', 'hospital'): 14,
 ('again', 'in'): 13,
 ('mother', 'who'): 4,
 ('to', 'follow'): 6,
 ('me', 'very'): 5,
 ('expected', 'that'): 5,
 ('told', 'a'): 6,
 ('fancy', 'i'): 5,
 ('the', 'inner'): 4,
 ('i', 'ought'): 16,
 ('head', 'of'): 5,
 ('her', 'a'): 20,
 ('her', 'mouth'): 8,
 ('was', 'pale'): 7,
 ('strange', 'to'): 21,
 ('of', 'some'): 19,
 ('a', 'special'): 12,
 ('in', 'perplexity'): 6,
 ('observed', 'that'): 4,
 ('rushed', 'to'): 19,
 ('the', 'night'): 18,
 ('yet', 'he'): 11,
 ('you', 'cried'): 6,
 ('you', 'may'): 46,
 ('door', 'of'): 10,
 ('bring', 'him'): 5,
 ('wont', 'you'): 8,
 ('oh', 'how'): 8,
 ('well', 'have'): 8,
 ('cinq', 'sous'): 6,
 ('her', 'how'): 6,
 ('too', 'little'): 4,
 ('face', 'in'): 9,
 ('good', 'manners'): 4,
 ('in', 'common'): 4,
 ('different', 'matter'): 7,
 ('she', 'cried'): 30,
 ('by', 'some'): 6,
 ('got', 'on'): 6,
 ('and', 'read'): 7,
 ('every', 'day'): 10,
 ('it', 'he'): 73,
 ('his', 'linen'): 6,
 ('is', 'rather'): 4,
 ('ivanovna', 'you'): 8,
 ('said', 'nothing'): 12,
 ('by', 'my'): 6,
 ('timidly', 'at'): 4,
 ('on', 'living'): 11,
 ('with', 'lizaveta'): 4,
 ('they', 'dont'): 18,
 ('sit', 'down'): 38,
 ('went', 'away'): 10,
 ('send', 'for'): 5,
 ('so', 'ill'): 4,
 ('if', 'the'): 11,
 ('frightened', 'by'): 4,
 ('have', 'made'): 12,
 ('know', 'its'): 4,
 ('said', 'at'): 7,
 ('and', 'fixed'): 4,
 ('dont', 'say'): 8,
 ('it', 'right'): 4,
 ('honest', 'and'): 7,
 ('it', 'seems'): 11,
 ('no', 'reply'): 9,
 ('you', 'both'): 5,
 ('been', 'here'): 14,
 ('said', 'goodbye'): 4,
 ('it', 'has'): 18,
 ('to', 'beat'): 5,
 ('if', 'so'): 4,
 ('kept', 'his'): 5,
 ('now', 'that'): 22,
 ('of', 'your'): 57,
 ('find', 'that'): 5,
 ('quickly', 'and'): 8,
 ('just', 'going'): 4,
 ('due', 'to'): 5,
 ('a', 'pause'): 5,
 ('a', 'free'): 5,
 ('only', 'one'): 11,
 ('with', 'horror'): 9,
 ('me', 'yesterday'): 4,
 ('crowd', 'of'): 8,
 ('my', 'mother'): 23,
 ('and', 'stared'): 8,
 ('open', 'the'): 12,
 ('as', 'a'): 87,
 ('couldnt', 'do'): 4,
 ('his', 'spine'): 5,
 ('how', 'the'): 6,
 ('into', 'a'): 45,
 ('not', 'merely'): 5,
 ('he', 'waked'): 4,
 ('way', 'or'): 6,
 ('since', 'yesterday'): 5,
 ('sensation', 'he'): 5,
 ('this', 'time'): 32,
 ('me', 'you'): 18,
 ('answer', 'for'): 6,
 ('his', 'side'): 4,
 ('but', 'only'): 10,
 ('is', 'this'): 6,
 ('shaking', 'his'): 4,
 ('so', 'be'): 6,
 ('blurted', 'out'): 10,
 ('from', 'behind'): 4,
 ('you', 'do'): 25,
 ('we', 'must'): 20,
 ('with', 'her'): 70,
 ('take', 'her'): 8,
 ('honour', 'i'): 4,
 ('the', 'colour'): 4,
 ('came', 'out'): 22,
 ('almost', 'with'): 11,
 ('drew', 'a'): 5,
 ('or', 'four'): 9,
 ('course', 'you'): 7,
 ('began', 'with'): 6,
 ('upon', 'him'): 28,
 ('to', 'raskolnikov'): 29,
 ('out', 'but'): 6,
 ('you', 'everything'): 4,
 ('as', 'it'): 72,
 ('all', 'who'): 6,
 ('of', 'service'): 5,
 ('the', 'policeman'): 18,
 ('and', 'found'): 5,
 ('your', 'illness'): 9,
 ('wrong', 'in'): 5,
 ('then', 'she'): 22,
 ('looking', 'round'): 9,
 ('where', 'she'): 8,
 ('not', 'delirious'): 4,
 ('head', 'sink'): 4,
 ('black', 'bread'): 4,
 ('still', 'be'): 4,
 ('succeed', 'in'): 4,
 ('he', 'listened'): 12,
 ('dont', 'understand'): 20,
 ('he', 'really'): 10,
 ('need', 'not'): 7,
 ('there', 'you'): 7,
 ('to', 'do'): 104,
 ('seemed', 'to'): 100,
 ('all', 'it'): 6,
 ('you', 'made'): 7,
 ('knew', 'of'): 4,
 ('himself', 'but'): 14,
 ('to', 'return'): 10,
 ('dreaming', 'of'): 6,
 ('brought', 'him'): 8,
 ('will', 'and'): 6,
 ('speak', 'and'): 4,
 ('got', 'into'): 6,
 ('what', 'we'): 8,
 ('had', 'already'): 12,
 ('years', 'and'): 6,
 ('character', 'and'): 7,
 ('can', 'always'): 4,
 ('need', 'of'): 13,
 ('and', 'secondly'): 7,
 ('understand', 'how'): 5,
 ('and', 'fifteen'): 4,
 ('remember', 'what'): 7,
 ('lived', 'in'): 6,
 ('the', 'murdered'): 4,
 ('was', 'given'): 4,
 ('the', 'chair'): 10,
 ('i', 'wanted'): 28,
 ('at', 'night'): 18,
 ('the', 'fact'): 23,
 ('leads', 'to'): 4,
 ('not', 'answer'): 7,
 ('asked', 'raskolnikov'): 9,
 ('always', 'be'): 4,
 ('corner', 'of'): 8,
 ('i', 'trust'): 5,
 ('years', 'in'): 5,
 ('while', 'he'): 20,
 ('have', 'told'): 7,
 ('not', 'tell'): 7,
 ('to', 'pyotr'): 8,
 ('i', 'found'): 8,
 ('then', 'why'): 7,
 ('long', 'before'): 8,
 ('morning', 'at'): 4,
 ('ilya', 'petrovitch'): 32,
 ('every', 'word'): 10,
 ('id', 'been'): 6,
 ('that', 'question'): 9,
 ('them', 'i'): 17,
 ('street', 'it'): 4,
 ('ideas', 'and'): 4,
 ('dont', 'care'): 12,
 ('has', 'come'): 9,
 ('chapter', 'iv'): 6,
 ('asked', 'her'): 8,
 ('the', 'sly'): 5,
 ('remembered', 'that'): 11,
 ('flashing', 'eyes'): 4,
 ('are', 'going'): 10,
 ('another', 'he'): 5,
 ('got', 'hold'): 4,
 ('avdotya', 'romanovnas'): 6,
 ('nonsense', 'he'): 5,
 ('whether', 'you'): 8,
 ('are', 'many'): 4,
 ('of', 'real'): 4,
 ('it', 'must'): 29,
 ('said', 'addressing'): 7,
 ('may', 'not'): 9,
 ('didnt', 'want'): 7,
 ('smile', 'as'): 4,
 ('movement', 'and'): 5,
 ('there', 'but'): 5,
 ('he', 'worried'): 4,
 ('the', 'whip'): 4,
 ('does', 'she'): 8,
 ('the', 'purse'): 14,
 ('to', 'carry'): 9,
 ('in', 'bed'): 4,
 ('thought', 'i'): 11,
 ('arms', 'and'): 4,
 ('more', 'i'): 8,
 ('back', 'and'): 14,
 ('myself', 'up'): 6,
 ('have', 'thought'): 9,
 ('thinking', 'and'): 5,
 ('white', 'as'): 5,
 ('and', 'contemptuously'): 4,
 ('almost', 'in'): 9,
 ('were', 'shaking'): 4,
 ('rid', 'of'): 13,
 ('the', 'lower'): 4,
 ('there', 'have'): 4,
 ('going', 'in'): 7,
 ('they', 'know'): 8,
 ('out', 'again'): 5,
 ('the', 'article'): 5,
 ('didnt', 'mean'): 4,
 ('get', 'round'): 4,
 ('though', 'he'): 86,
 ('was', 'my'): 5,
 ('could', 'never'): 10,
 ('his', 'plans'): 4,
 ('herself', 'for'): 5,
 ('this', 'had'): 4,
 ('expressed', 'it'): 4,
 ('our', 'first'): 4,
 ('dont', 'be'): 37,
 ('too', 'you'): 5,
 ('it', 'when'): 10,
 ('off', 'to'): 11,
 ('time', 'ago'): 4,
 ('government', 'quarters'): 4,
 ('raskolnikov', 'cried'): 9,
 ('that', 'before'): 4,
 ('where', 'to'): 4,
 ('me', 'so'): 11,
 ('him', 'so'): 17,
 ('cried', 'raskolnikov'): 12,
 ('window', 'and'): 14,
 ('he', 'sat'): 19,
 ('make', 'up'): 7,
 ('house', 'was'): 5,
 ('it', 'does'): 10,
 ('almost', 'aloud'): 4,
 ('razumihin', 'shouted'): 6,
 ('goodbye', 'and'): 4,
 ('the', 'stone'): 10,
 ('and', 'without'): 11,
 ('take', 'you'): 9,
 ('answered', 'svidrigaïlov'): 4,
 ('all', 'is'): 11,
 ('holding', 'out'): 8,
 ('he', 'hid'): 4,
 ('have', 'the'): 27,
 ('all', 'such'): 4,
 ('last', 'moment'): 5,
 ('after', 'dinner'): 5,
 ('the', 'road'): 13,
 ('addressing', 'razumihin'): 4,
 ('word', 'of'): 8,
 ('ive', 'got'): 6,
 ('will', 'be'): 96,
 ('what', 'that'): 4,
 ('here', 'i'): 23,
 ('a', 'german'): 6,
 ('where', 'they'): 5,
 ('here', 'a'): 4,
 ('and', 'rather'): 10,
 ('trying', 'to'): 40,
 ('to', 'your'): 27,
 ('zametov', 'is'): 5,
 ('eyes', 'which'): 4,
 ('want', 'you'): 6,
 ('why', 'am'): 16,
 ('short', 'a'): 4,
 ('to', 'myself'): 13,
 ('might', 'not'): 9,
 ('you', 'must'): 55,
 ('and', 'he'): 241,
 ('him', 'more'): 7,
 ('his', 'lodgings'): 4,
 ('say', 'goodbye'): 5,
 ('up', 'at'): 16,
 ('you', 'now'): 13,
 ('somewhere', 'to'): 5,
 ('to', 'accept'): 8,
 ('i', 'only'): 33,
 ('that', 'sort'): 10,
 ('to', 'talk'): 20,
 ('a', 'high'): 5,
 ('last', 'she'): 4,
 ('his', 'recent'): 6,
 ('room', 'like'): 4,
 ('him', 'at'): 34,
 ('raskolnikov', 'answered'): 15,
 ('by', 'and'): 4,
 ('he', 'did'): 91,
 ('want', 'me'): 4,
 ('i', 'right'): 4,
 ('seen', 'and'): 5,
 ('no', 'facts'): 5,
 ('away', 'i'): 7,
 ('me', 'what'): 15,
 ('the', 'man'): 66,
 ('sir', 'he'): 5,
 ('she', 'wont'): 5,
 ('man', 'who'): 33,
 ('mine', 'and'): 4,
 ('read', 'the'): 4,
 ('her', 'in'): 30,
 ('i', 'wasnt'): 4,
 ('going', 'into'): 5,
 ('to', 'sing'): 8,
 ('a', 'step'): 10,
 ('us', 'a'): 5,
 ('your', 'opinion'): 4,
 ('the', 'better'): 13,
 ('a', 'fearful'): 7,
 ('hehehe', 'you'): 4,
 ('an', 'honest'): 5,
 ('to', 'believe'): 9,
 ('this', 'i'): 10,
 ('with', 'one'): 7,
 ('the', 'heart'): 11,
 ('his', 'arms'): 10,
 ('my', 'friend'): 10,
 ('hotly', 'and'): 5,
 ('its', 'nonsense'): 5,
 ('passed', 'between'): 7,
 ('if', 'they'): 22,
 ('of', 'tea'): 6,
 ('drew', 'him'): 5,
 ('left', 'in'): 8,
 ('very', 'difficult'): 4,
 ('suddenly', 'as'): 4,
 ('me', 'up'): 5,
 ('idea', 'struck'): 9,
 ('thinking', 'of'): 19,
 ('you', 'care'): 6,
 ('nonsense', 'i'): 5,
 ('of', 'which'): 22,
 ('another', 'i'): 4,
 ('if', 'that'): 6,
 ('the', 'accident'): 4,
 ('honourable', 'house'): 6,
 ('down', 'with'): 10,
 ('moment', 'later'): 6,
 ('shall', 'see'): 14,
 ('whats', 'this'): 5,
 ('wrapped', 'in'): 5,
 ('own', 'accord'): 5,
 ('a', 'tall'): 4,
 ('with', 'some'): 22,
 ('ill', 'make'): 6,
 ('seven', 'years'): 14,
 ('wrong', 'with'): 4,
 ('his', 'feet'): 7,
 ('be', 'ready'): 5,
 ('must', 'talk'): 4,
 ('him', 'when'): 7,
 ('the', 'conviction'): 4,
 ('raskolnikov', 'made'): 6,
 ('now', 'to'): 18,
 ('you', 'if'): 7,
 ('were', 'they'): 4,
 ('the', 'paper'): 20,
 ('day', 'the'): 5,
 ('in', 'delirium'): 6,
 ('it', 'said'): 7,
 ('has', 'only'): 5,
 ('been', 'too'): 4,
 ('about', 'her'): 29,
 ('cut', 'the'): 4,
 ('used', 'to'): 59,
 ('muttered', 'the'): 4,
 ('for', 'us'): 13,
 ('tell', 'you'): 112,
 ('had', 'two'): 7,
 ('youd', 'better'): 13,
 ('be', 'anxious'): 4,
 ('afanasy', 'ivanovitch'): 5,
 ('the', 'crowd'): 43,
 ('my', 'first'): 7,
 ('alexandrovna', 'was'): 15,
 ('man', 'again'): 4,
 ('all', 'right'): 31,
 ('set', 'of'): 4,
 ('open', 'and'): 12,
 ('haste', 'and'): 7,
 ('her', 'husband'): 14,
 ('them', 'the'): 12,
 ('felt', 'so'): 4,
 ('goodness', 'knows'): 6,
 ('not', 'this'): 5,
 ('so', 'far'): 11,
 ('consider', 'that'): 4,
 ('but', 'another'): 4,
 ('know', 'all'): 15,
 ('brother', 'he'): 7,
 ('hour', 'before'): 4,
 ('the', 'string'): 5,
 ('shall', 'i'): 29,
 ('understand', 'it'): 15,
 ('to', 'become'): 5,
 ('and', 'make'): 15,
 ('shoulders', 'and'): 4,
 ('the', 'gate'): 13,
 ('voice', 'and'): 4,
 ('can', 'we'): 4,
 ('a', 'cardsharper'): 4,
 ('chair', 'and'): 8,
 ('his', 'way'): 34,
 ('all', 'sorts'): 20,
 ('took', 'out'): 8,
 ('off', 'i'): 6,
 ('came', 'upon'): 6,
 ('of', 'marriage'): 4,
 ('table', 'he'): 4,
 ('more', 'honourable'): 5,
 ('for', 'instance'): 36,
 ('left', 'the'): 14,
 ('him', 'who'): 5,
 ('tears', 'and'): 5,
 ('was', 'better'): 6,
 ('was', 'dark'): 5,
 ('at', 'finding'): 4,
 ('rodion', 'romanovitch'): 86,
 ('isnt', 'it'): 17,
 ('glass', 'of'): 10,
 ('he', 'always'): 7,
 ('least', 'he'): 8,
 ('purpose', 'to'): 12,
 ('a', 'relation'): 7,
 ('is', 'my'): 14,
 ('proof', 'of'): 4,
 ('her', 'heart'): 13,
 ('rouble', 'and'): 8,
 ('is', 'now'): 5,
 ('a', 'nice'): 10,
 ('as', 'possible'): 23,
 ('the', 'position'): 8,
 ('broke', 'off'): 6,
 ('be', 'alone'): 6,
 ('keeping', 'with'): 6,
 ('to', 'defend'): 4,
 ('that', 'had'): 23,
 ('it', 'and'): 126,
 ('murder', 'and'): 7,
 ('not', 'what'): 7,
 ('of', 'taking'): 4,
 ('said', 'suddenly'): 13,
 ('a', 'cat'): 4,
 ('great', 'thing'): 6,
 ('mother', 'said'): 7,
 ('he', 'saw'): 41,
 ('intelligent', 'man'): 4,
 ('his', 'pale'): 4,
 ('right', 'side'): 4,
 ('am', 'the'): 7,
 ('both', 'sides'): 5,
 ('washing', 'the'): 4,
 ('come', 'rodya'): 4,
 ('state', 'of'): 13,
 ('idea', 'of'): 16,
 ('illness', 'he'): 4,
 ('think', 'you'): 12,
 ('a', 'crime'): 9,
 ('please', 'do'): 4,
 ('gentleman', 'was'): 5,
 ('better', 'and'): 4,
 ('whether', 'i'): 11,
 ('besides', 'he'): 4,
 ('to', 'explain'): 15,
 ('first', 'place'): 15,
 ('next', 'day'): 19,
 ('a', 'place'): 4,
 ('way', 'but'): 4,
 ('white', 'and'): 7,
 ('the', 'passersby'): 6,
 ('dirty', 'and'): 6,
 ('her', 'to'): 44,
 ('set', 'off'): 8,
 ('moment', 'a'): 4,
 ('in', 'their'): 22,
 ('which', 'had'): 17,
 ('is', 'true'): 7,
 ('i', 'agree'): 5,
 ('and', 'contempt'): 4,
 ('fever', 'he'): 4,
 ('get', 'her'): 6,
 ('be', 'the'): 29,
 ('in', 'every'): 11,
 ('a', 'fresh'): 4,
 ('not', 'more'): 9,
 ('though', 'in'): 11,
 ('walk', 'about'): 6,
 ('well', 'brother'): 5,
 ('they', 'heard'): 4,
 ('had', 'seen'): 24,
 ('ago', 'i'): 9,
 ('and', 'still'): 10,
 ('herself', 'to'): 6,
 ('their', 'eyes'): 8,
 ('so', 'i'): 33,
 ('your', 'sister'): 25,
 ('at', 'its'): 4,
 ('axe', 'and'): 9,
 ('showed', 'him'): 4,
 ('do', 'she'): 4,
 ('you', 'go'): 26,
 ('i', 'dont'): 163,
 ('would', 'he'): 6,
 ('i', 'to'): 33,
 ('a', 'cry'): 6,
 ('interrupted', 'with'): 7,
 ('besides', 'the'): 4,
 ('an', 'hour'): 45,
 ('that', 'as'): 11,
 ('its', 'being'): 5,
 ('more', 'to'): 8,
 ('was', 'completely'): 4,
 ('mother', 'he'): 7,
 ('up', 'that'): 5,
 ('not', 'made'): 4,
 ('his', 'elbow'): 5,
 ('you', 'only'): 8,
 ('opened', 'the'): 23,
 ('men', 'and'): 9,
 ('porfiry', 'but'): 4,
 ('indignation', 'at'): 4,
 ('svidrigaïlov', 'had'): 8,
 ('was', 'doing'): 12,
 ('they', 'went'): 6,
 ('his', 'illness'): 11,
 ('just', 'come'): 13,
 ('it', 'because'): 8,
 ('house', 'he'): 4,
 ('looking', 'down'): 6,
 ('table', 'in'): 7,
 ('very', 'young'): 4,
 ('plans', 'and'): 5,
 ('round', 'the'): 11,
 ('them', 'they'): 11,
 ('to', 'utter'): 5,
 ('fury', 'and'): 4,
 ('a', 'future'): 4,
 ('have', 'liked'): 5,
 ('down', 'and'): 18,
 ('alone', 'and'): 8,
 ('see', 'him'): 24,
 ('minutes', 'and'): 6,
 ('but', 'is'): 5,
 ('in', 'you'): 7,
 ('and', 'night'): 5,
 ('my', 'illness'): 4,
 ('his', 'senses'): 6,
 ('from', 'one'): 10,
 ('a', 'shock'): 4,
 ('your', 'brother'): 11,
 ('that', 'his'): 35,
 ('understand', 'the'): 5,
 ('in', 'keeping'): 5,
 ('have', 'only'): 14,
 ('you', 'in'): 25,
 ('elbow', 'on'): 5,
 ('into', 'his'): 40,
 ('and', 'also'): 7,
 ('i', 'was'): 193,
 ('the', 'form'): 4,
 ('get', 'married'): 6,
 ('had', 'an'): 4,
 ('them', 'sonia'): 4,
 ('and', 'put'): 17,
 ('of', 'all'): 53,
 ('ive', 'had'): 6,
 ('and', 'apparently'): 4,
 ('answer', 'and'): 4,
 ('see', 'clearly'): 4,
 ('hands', 'as'): 4,
 ('a', 'genuine'): 4,
 ('a', 'stone'): 10,
 ('give', 'it'): 13,
 ('they', 'came'): 9,
 ('met', 'him'): 8,
 ('very', 'likely'): 11,
 ('assistant', 'superintendent'): 8,
 ('speak', 'but'): 7,
 ('he', 'showed'): 5,
 ('in', 'i'): 14,
 ('right', 'he'): 4,
 ('eyes', 'met'): 4,
 ('but', 'youre'): 4,
 ('my', 'word'): 20,
 ('the', 'ceiling'): 4,
 ('ran', 'down'): 8,
 ('in', 'too'): 7,
 ('me', 'im'): 4,
 ('rest', 'on'): 4,
 ('that', 'was'): 67,
 ('is', 'there'): 23,
 ('it', 'in'): 40,
 ('and', 'be'): 6,
 ('off', 'at'): 5,
 ('the', 'name'): 5,
 ('go', 'home'): 6,
 ('the', 'railing'): 5,
 ('who', 'killed'): 5,
 ('would', 'do'): 5,
 ('happiness', 'of'): 7,
 ('one', 'moment'): 7,
 ('now', 'he'): 42,
 ('find', 'her'): 4,
 ('and', 'had'): 55,
 ('away', 'the'): 15,
 ('story', 'of'): 8,
 ('she', 'took'): 10,
 ('him', 'this'): 10,
 ('me', 'this'): 8,
 ('can', 'she'): 5,
 ('round', 'him'): 9,
 ('will', 'begin'): 4,
 ('with', 'three'): 5,
 ('asked', 'himself'): 6,
 ('would', 'be'): 106,
 ('havent', 'i'): 4,
 ('you', 'of'): 7,
 ('much', 'so'): 9,
 ('the', 'arm'): 4,
 ('such', 'cases'): 4,
 ('though', 'katerina'): 5,
 ('raskolnikov', 'stopped'): 5,
 ('cant', 'be'): 18,
 ('an', 'absurd'): 4,
 ('like', 'the'): 13,
 ('him', 'let'): 4,
 ('better', 'for'): 11,
 ('ask', 'forgiveness'): 4,
 ('especially', 'in'): 5,
 ('the', 'islands'): 4,
 ('no', 'thats'): 8,
 ('had', 'known'): 8,
 ('such', 'things'): 4,
 ('case', 'with'): 4,
 ('it', 'is'): 161,
 ('before', 'i'): 5,
 ('to', 'understand'): 16,
 ('heard', 'all'): 4,
 ('was', 'all'): 27,
 ('sign', 'of'): 6,
 ('was', 'full'): 4,
 ('for', 'having'): 7,
 ('since', 'then'): 4,
 ('sort', 'i'): 4,
 ('very', 'much'): 33,
 ('could', 'only'): 14,
 ('with', 'nervous'): 4,
 ('far', 'away'): 4,
 ('sonia', 'stood'): 4,
 ('bed', 'with'): 4,
 ('fear', 'of'): 6,
 ('thought', 'to'): 9,
 ('anxiety', 'and'): 7,
 ('till', 'we'): 7,
 ('money', 'in'): 5,
 ('stopped', 'short'): 13,
 ('although', 'i'): 5,
 ('your', 'mother'): 16,
 ('one', 'and'): 11,
 ('breath', 'of'): 4,
 ('quite', 'in'): 4,
 ('shall', 'never'): 6,
 ('now', 'i'): 53,
 ('i', 'simply'): 14,
 ('today', 'that'): 5,
 ('every', 'step'): 5,
 ('said', 'in'): 10,
 ('to', 'get'): 76,
 ('she', 'gave'): 13,
 ('turned', 'round'): 8,
 ('present', 'from'): 5,
 ('nothing', 'in'): 9,
 ('me', 'but'): 36,
 ('is', 'a'): 137,
 ('a', 'level'): 6,
 ('for', 'everything'): 6,
 ('do', 'they'): 6,
 ('it', 'out'): 25,
 ('o', 'u'): 16,
 ('he', 'gazed'): 11,
 ('he', 'ought'): 10,
 ('hold', 'out'): 4,
 ('two', 'hours'): 4,
 ('social', 'position'): 4,
 ('he', 'whispered'): 8,
 ('another', 'and'): 7,
 ('me', 'of'): 8,
 ('since', 'i'): 9,
 ('sank', 'into'): 19,
 ('a', 'pity'): 8,
 ('yesterday', 'at'): 4,
 ('thats', 'it'): 14,
 ('in', 'moscow'): 6,
 ('but', 'of'): 6,
 ('up', 'she'): 4,
 ('and', 'can'): 7,
 ('felt', 'a'): 18,
 ('lay', 'the'): 4,
 ('or', 'whether'): 4,
 ('out', 'to'): 22,
 ('possible', 'to'): 9,
 ('talked', 'of'): 6,
 ('shouted', 'razumihin'): 5,
 ('eyes', 'in'): 6,
 ('it', 'really'): 5,
 ('illness', 'was'): 4,
 ('but', 'after'): 4,
 ('an', 'important'): 7,
 ('ashamed', 'and'): 4,
 ('no', 'interest'): 4,
 ('looked', 'in'): 8,
 ('could', 'be'): 41,
 ('there', 'and'): 17,
 ('paid', 'for'): 7,
 ('could', 'you'): 17,
 ('each', 'other'): 28,
 ('i', 'knew'): 27,
 ('yes', 'theres'): 5,
 ('a', 'dozen'): 5,
 ('legal', 'marriage'): 4,
 ('doing', 'it'): 5,
 ('to', 'know'): 44,
 ('a', 'doctor'): 12,
 ('with', 'blood'): 13,
 ('they', 'sat'): 4,
 ('talking', 'in'): 6,
 ('thought', 'as'): 4,
 ('that', 'he'): 335,
 ('the', 'prison'): 6,
 ('uneasily', 'at'): 4,
 ('perhaps', 'the'): 4,
 ('oh', 'thats'): 6,
 ('more', 'of'): 4,
 ('got', 'a'): 10,
 ('look', 'here'): 5,
 ('there', 'had'): 8,
 ('them', 'so'): 9,
 ('or', 'less'): 5,
 ('does', 'it'): 20,
 ('not', 'ill'): 4,
 ('last', 'to'): 4,
 ('had', 'just'): 34,
 ('youre', 'right'): 6,
 ('i', 'would'): 38,
 ('i', 'used'): 10,
 ('were', 'standing'): 4,
 ('name', 'of'): 6,
 ('pay', 'him'): 4,
 ('cried', 'dounia'): 12,
 ('he', 'wont'): 7,
 ('kissing', 'her'): 4,
 ('answer', 'he'): 6,
 ('and', 'both'): 8,
 ('been', 'taken'): 7,
 ('could', 'hear'): 10,
 ('covered', 'with'): 21,
 ('my', 'god'): 9,
 ('have', 'something'): 6,
 ('he', 'made'): 29,
 ('his', 'shoulder'): 5,
 ('not', 'remember'): 5,
 ('it', 'thats'): 15,
 ('cry', 'of'): 4,
 ('heard', 'it'): 12,
 ('crossed', 'the'): 4,
 ('yes', 'that'): 4,
 ('that', 'the'): 109,
 ('you', 'who'): 9,
 ('before', 'in'): 6,
 ('what', 'what'): 7,
 ('mr', 'lebeziatnikov'): 16,
 ('no', 'business'): 4,
 ('almost', 'the'): 5,
 ('day', 'at'): 5,
 ('yes', 'indeed'): 4,
 ('what', 'sort'): 8,
 ('if', 'id'): 5,
 ('both', 'his'): 4,
 ('to', 'give'): 40,
 ('me', 'for'): 32,
 ('was', 'it'): 42,
 ('way', 'in'): 8,
 ('he', 'shouted'): 27,
 ('doubt', 'he'): 4,
 ('a', 'change'): 4,
 ('youve', 'heard'): 4,
 ('porfiry', 'and'): 6,
 ('are', 'two'): 4,
 ('door', 'opened'): 9,
 ('you', 'suppose'): 11,
 ('sent', 'a'): 6,
 ('amalia', 'ludwigovna'): 8,
 ('the', 'cemetery'): 6,
 ('get', 'away'): 10,
 ('sister', 'i'): 5,
 ('a', 'general'): 6,
 ('he', 'let'): 4,
 ('ive', 'been'): 43,
 ('seat', 'and'): 6,
 ('happened', 'i'): 4,
 ('at', 'our'): 4,
 ('tell', 'him'): 4,
 ('hour', 'later'): 8,
 ('svidrigaïlov', 'with'): 5,
 ('i', 'told'): 31,
 ('im', 'a'): 4,
 ('he', 'smiled'): 5,
 ('murdered', 'her'): 4,
 ('see', 'that'): 26,
 ('to', 'face'): 9,
 ('month', 'ago'): 7,
 ('with', 'hatred'): 4,
 ('besides', 'i'): 9,
 ('something', 'from'): 6,
 ('why', 'it'): 8,
 ('the', 'little'): 29,
 ('subject', 'and'): 4,
 ('is', 'something'): 5,
 ('semyon', 'zaharovitch'): 10,
 ('he', 'stopped'): 14,
 ('himself', 'with'): 10,
 ('silence', 'followed'): 4,
 ('you', 'gave'): 7,
 ('so', 'that'): 94,
 ('his', 'seat'): 6,
 ('on', 'one'): 16,
 ('so', 'the'): 9,
 ('up', 'everything'): 7,
 ('the', 'bridge'): 19,
 ('times', 'when'): 4,
 ('coming', 'here'): 7,
 ('keep', 'him'): 5,
 ('you', 'porfiry'): 5,
 ('the', 'sounds'): 4,
 ('second', 'storey'): 4,
 ('her', 'daughter'): 12,
 ('left', 'alone'): 8,
 ('heart', 'as'): 4,
 ('listening', 'to'): 7,
 ('for', 'herself'): 5,
 ('sometimes', 'he'): 5,
 ('found', 'a'): 9,
 ('door', 'was'): 18,
 ('i', 'assure'): 26,
 ('bound', 'to'): 11,
 ('beginning', 'to'): 13,
 ('followed', 'her'): 5,
 ('sensible', 'woman'): 4,
 ('think', 'i'): 12,
 ('the', 'canal'): 21,
 ('are', 'at'): 4,
 ('only', 'wanted'): 6,
 ('he', 'didnt'): 6,
 ('he', 'almost'): 8,
 ('believe', 'me'): 21,
 ('one', 'cant'): 7,
 ('call', 'me'): 4,
 ('this', 'way'): 12,
 ('of', 'yours'): 15,
 ('sent', 'me'): 7,
 ('of', 'fever'): 4,
 ('we', 'to'): 7,
 ('was', 'even'): 9,
 ('how', 'a'): 5,
 ('a', 'thousand'): 14,
 ...}